{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "993a2a24-1a58-42be-8034-6d116fb8d786",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import re\n",
    "import math\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "import random\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "import numpy as np\n",
    "import pickle\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from datasets import load_dataset\n",
    "import chromadb\n",
    "from items import Item\n",
    "from sklearn.manifold import TSNE\n",
    "import plotly.graph_objects as go"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2359ccc0-dbf2-4b1e-9473-e472b32f548b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# environment\n",
    "\n",
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
    "DB = \"products_vectorstore\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "645167e6-cf0d-42d2-949f-1089a25a2841",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log in to HuggingFace\n",
    "\n",
    "hf_token = os.environ['HF_TOKEN']\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "688bd995-ec3e-43cd-8179-7fe14b275877",
   "metadata": {},
   "outputs": [],
   "source": [
    "# With train.pkl in this folder\n",
    "with open('train.pkl', 'rb') as file:\n",
    "    train = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4aab95e-d719-4476-b6e7-e248120df25a",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = chromadb.PersistentClient(path=DB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if the collection exists and delete it if it does\n",
    "collection_name = \"products\"\n",
    "existing_collection_names = [collection.name for collection in client.list_collections()]\n",
    "if collection_name in existing_collection_names:\n",
    "    client.delete_collection(collection_name)\n",
    "    print(f\"Deleted existing collection: {collection_name}\")\n",
    "\n",
    "collection = client.create_collection(collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a87db200-d19d-44bf-acbd-15c45c70f5c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b23a025-4c35-4d3a-96ad-b956cad37b0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass in a list of texts, get back a numpy array of vectors\n",
    "vector = model.encode([\"Well hi there\"])[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8adde63f-e732-4f7c-bba9-f8b2a469f14e",
   "metadata": {},
   "outputs": [],
   "source": [
    "vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38de1bf8-c9b5-45b4-9f4b-86af93b3f80d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def description(item):\n",
    "    text = item.prompt.replace(\"How much does this cost to the nearest dollar?\\n\\n\", \"\")\n",
    "    return text.split(\"\\n\\nPrice is $\")[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c1205bd-4692-44ef-8ea4-69f255354537",
   "metadata": {},
   "outputs": [],
   "source": [
    "description(train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c79e2fe-1f50-4ebf-9a93-34f3088f2996",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in tqdm(range(0, len(train), 1000)):\n",
    "    documents = [description(item) for item in train[i: i+1000]]\n",
    "    vectors = model.encode(documents).astype(float).tolist()\n",
    "    metadatas = [{\"category\": item.category, \"price\": item.price} for item in train[i: i+1000]]\n",
    "    ids = [f\"doc_{j}\" for j in range(i, i+1000)]\n",
    "    collection.add(\n",
    "        ids=ids,\n",
    "        documents=documents,\n",
    "        embeddings=vectors,\n",
    "        metadatas=metadatas\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a9395db-7bc9-47f9-902f-af8d380c9c09",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "745f73d9-f1a6-4e9f-96d9-1c38a1dd7559",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
